Lab 11

Author

Sylwia Lipior

1. Read in the data

# Load the necessary libraries
library(data.table)

# Load COVID state-level data from NYT
cv_states <- fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

# Load state population data
state_pops <- fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")

# Rename the columns for merging
setnames(state_pops, old = "state", new = "abb")
setnames(state_pops, old = "state_name", new = "state")

# Merge the datasets
cv_states <- merge(cv_states, state_pops, by = "state")

2. Look at the data

dim(cv_states)
[1] 58094     9
head(cv_states)
     state       date fips cases deaths abb geo_id population pop_density
1: Alabama 2020-03-13    1     6      0  AL      1    4887871    96.50939
2: Alabama 2020-03-14    1    12      0  AL      1    4887871    96.50939
3: Alabama 2020-03-15    1    23      0  AL      1    4887871    96.50939
4: Alabama 2020-03-16    1    29      0  AL      1    4887871    96.50939
5: Alabama 2020-03-17    1    39      0  AL      1    4887871    96.50939
6: Alabama 2020-03-18    1    51      0  AL      1    4887871    96.50939
tail(cv_states)
     state       date fips  cases deaths abb geo_id population pop_density
1: Wyoming 2023-03-18   56 185640   2009  WY     56     577737    5.950611
2: Wyoming 2023-03-19   56 185640   2009  WY     56     577737    5.950611
3: Wyoming 2023-03-20   56 185640   2009  WY     56     577737    5.950611
4: Wyoming 2023-03-21   56 185800   2014  WY     56     577737    5.950611
5: Wyoming 2023-03-22   56 185800   2014  WY     56     577737    5.950611
6: Wyoming 2023-03-23   56 185800   2014  WY     56     577737    5.950611
str(cv_states)
Classes 'data.table' and 'data.frame':  58094 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : IDate, format: "2020-03-13" "2020-03-14" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 - attr(*, ".internal.selfref")=<externalptr> 
 - attr(*, "sorted")= chr "state"

3. Format the data

# Format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")

# Format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)

# Order the data first by state, second by date
cv_states <- cv_states[order(cv_states$state, cv_states$date),]

# Confirm the variables are now correctly formatted
str(cv_states)
Classes 'data.table' and 'data.frame':  58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2020-03-13" "2020-03-14" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 - attr(*, ".internal.selfref")=<externalptr> 
head(cv_states)
     state       date fips cases deaths abb geo_id population pop_density
1: Alabama 2020-03-13    1     6      0  AL      1    4887871    96.50939
2: Alabama 2020-03-14    1    12      0  AL      1    4887871    96.50939
3: Alabama 2020-03-15    1    23      0  AL      1    4887871    96.50939
4: Alabama 2020-03-16    1    29      0  AL      1    4887871    96.50939
5: Alabama 2020-03-17    1    39      0  AL      1    4887871    96.50939
6: Alabama 2020-03-18    1    51      0  AL      1    4887871    96.50939
tail(cv_states)
     state       date fips  cases deaths abb geo_id population pop_density
1: Wyoming 2023-03-18   56 185640   2009  WY     56     577737    5.950611
2: Wyoming 2023-03-19   56 185640   2009  WY     56     577737    5.950611
3: Wyoming 2023-03-20   56 185640   2009  WY     56     577737    5.950611
4: Wyoming 2023-03-21   56 185800   2014  WY     56     577737    5.950611
5: Wyoming 2023-03-22   56 185800   2014  WY     56     577737    5.950611
6: Wyoming 2023-03-23   56 185800   2014  WY     56     577737    5.950611
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
summary(cv_states)
           state            date                 fips           cases         
 Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
 Illinois     : 1155   1st Qu.:2020-12-06   1st Qu.:16.00   1st Qu.:  112125  
 California   : 1154   Median :2021-09-11   Median :29.00   Median :  418120  
 Arizona      : 1153   Mean   :2021-09-10   Mean   :29.78   Mean   :  947941  
 Massachusetts: 1147   3rd Qu.:2022-06-17   3rd Qu.:44.00   3rd Qu.: 1134318  
 Wisconsin    : 1143   Max.   :2023-03-23   Max.   :72.00   Max.   :12169158  
 (Other)      :51184                                                          
     deaths            abb            geo_id        population      
 Min.   :     0   WA     : 1158   Min.   : 1.00   Min.   :  577737  
 1st Qu.:  1598   IL     : 1155   1st Qu.:16.00   1st Qu.: 1805832  
 Median :  5901   CA     : 1154   Median :29.00   Median : 4468402  
 Mean   : 12553   AZ     : 1153   Mean   :29.78   Mean   : 6397965  
 3rd Qu.: 15952   MA     : 1147   3rd Qu.:44.00   3rd Qu.: 7535591  
 Max.   :104277   WI     : 1143   Max.   :72.00   Max.   :39557045  
                  (Other):51184                                     
  pop_density       
 Min.   :    1.292  
 1st Qu.:   43.659  
 Median :  107.860  
 Mean   :  423.031  
 3rd Qu.:  229.511  
 Max.   :11490.120  
 NA's   :1106       
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

4. Add new_cases and new_deaths and correct outliers

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(zoo)

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(ggplot2)
library(htmlwidgets)

# Add variables for new_cases and new_deaths
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])
  cv_subset <- cv_subset[order(cv_subset$date),]

  # Add starting level for new cases and deaths
  cv_subset$new_cases <- cv_subset$cases[1]
  cv_subset$new_deaths <- cv_subset$deaths[1]

  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] <- cv_subset$cases[j] - cv_subset$cases[j-1]
    cv_subset$new_deaths[j] <- cv_subset$deaths[j] - cv_subset$deaths[j-1]
  }

  # Include in the main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] <- cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] <- cv_subset$new_deaths
}

# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")

# Inspect outliers in new_cases using plotly
p1 <- ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
  geom_point(size = .5, alpha = 0.5) +
  labs(title = "New Cases by Date and State") +
  theme_minimal()
ggplotly(p1)
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_point(size = .5, alpha = 0.5) +
  labs(title = "New Deaths by Date and State") +
  theme_minimal()
ggplotly(p2)
# Set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases < 0] <- 0
cv_states$new_deaths[cv_states$new_deaths < 0] <- 0

# Recalculate cases and deaths as the cumulative sum of updated new_cases and new_deaths
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])

  # Add starting level for new cases and deaths
  cv_subset$cases <- cv_subset$cases[1]
  cv_subset$deaths <- cv_subset$deaths[1]

  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] <- cv_subset$new_cases[j] + cv_subset$cases[j-1]
    cv_subset$deaths[j] <- cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
  }
  
  # Include in the main dataset
  cv_states$cases[cv_states$state==state_list[i]] <- cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] <- cv_subset$deaths
}

# Smooth new counts
cv_states$new_cases <- zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths <- zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)

# Inspect data again interactively
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5) +
  labs(title = "New Deaths by Date and State") +
  theme_minimal()
ggplotly(p2)

5. Add additional variables

# Add population-normalized (per 100,000) variables
cv_states$per100k = as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))
Warning: NAs introduced by coercion
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))

# Check for missing or NA values in new_deaths or population
missing_values <- is.na(cv_states$new_deaths) | is.na(cv_states$population)

# Calculate newdeathsper100k, but set it to NA for rows with missing values
cv_states$newdeathsper100k <- ifelse(missing_values, NA, as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1)))
Warning in ifelse(missing_values, NA,
as.numeric(format(round(cv_states$new_deaths/(cv_states$population/1e+05), :
NAs introduced by coercion
# Add a naive CFR (Case Fatality Rate) variable
cv_states$naive_CFR = round((cv_states$deaths * 100 / cv_states$cases), 2)

# Create a cv_states_today dataframe representing values on the most recent date
cv_states_today = subset(cv_states, date == max(cv_states$date))

6. Explore scatterplots

# pop_density vs. cases
scatterplot_cases <- cv_states_today %>%
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5)) %>%
  layout(title = "COVID-19 Cases vs. Population Density for US States",
         yaxis = list(title = "Cases"), xaxis = list(title = "Population Density"),
         hovermode = "compare")

# Filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")

# pop_density vs. cases after filtering
scatterplot_cases_filtered <- cv_states_today_filter %>%
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5)) %>%
  layout(title = "COVID-19 Cases vs. Population Density for US States (Filtered)",
         yaxis = list(title = "Cases"), xaxis = list(title = "Population Density"),
         hovermode = "compare")

# pop_density vs. deathsper100k
scatterplot_deathsper100k <- cv_states_today_filter %>%
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5)) %>%
  layout(title = "Population-normalized COVID-19 Deaths (per 100k) vs. Population Density for US States",
         yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")

# Adding hoverinfo
scatterplot_deathsper100k <- scatterplot_deathsper100k %>%
  add_trace(text = ~paste(state, ":<br>Cases per 100k: ", per100k, "<br>Deaths per 100k: ", deathsper100k)) 

# Display the scatterplot
scatterplot_deathsper100k
Warning: Ignoring 1 observations

Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()

# Scatterplot trend for pop_density vs. newdeathsper100k
p <- ggplot(cv_states_today_filter, aes(x = pop_density, y = newdeathsper100k, size = population)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Adding a linear regression trend line
  labs(title = "Population-normalized New Deaths (per 100k) vs. Population Density",
       x = "Population Density",
       y = "New Deaths per 100k") +
  scale_size_continuous(name = "Population", breaks = c(1e6, 5e6, 1e7, 2e7), labels = c("1M", "5M", "10M", "20M")) +
  theme_minimal()  # You can customize the theme as needed

# Convert the ggplot plot to an interactive plot with ggplotly
p <- ggplotly(p)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?
# Display the interactive plot
p

8. Multiple line chart

# Line chart for naive_CFR for all states over time using plot_ly()
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines") %>%
  layout(title = "Naive CFR Over Time for All States",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Naive CFR"),
         showlegend = TRUE)
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
  filter(state == "Florida") %>%
  plot_ly(x = ~date, type = "scatter", mode = "lines", name = "New Cases", y = ~new_cases) %>%
  add_trace(x = ~date, y = ~new_deaths, name = "New Deaths") %>%
  layout(title = "New Cases and New Deaths Over Time in Florida",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Count"),
         showlegend = TRUE)

9. Heatmaps

# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% 
  select(state, date, new_cases) %>% 
  dplyr::filter(date > as.Date("2021-06-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
heatmap_cases <- plot_ly(
  x = colnames(cv_states_mat2), 
  y = rownames(cv_states_mat2),
  z = ~cv_states_mat2,
  type = "heatmap",
  showscale = TRUE
) %>%
  layout(
    title = "New Cases Heatmap",
    xaxis = list(title = "State"),
    yaxis = list(title = "Date")
  )

heatmap_cases
# Repeat with newper100k
cv_states_mat <- cv_states %>% 
  select(state, date, newper100k) %>% 
  dplyr::filter(date > as.Date("2021-06-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

heatmap_per100k <- plot_ly(
  x = colnames(cv_states_mat2), 
  y = rownames(cv_states_mat2),
  z = ~cv_states_mat2,
  type = "heatmap",
  showscale = TRUE
) %>%
  layout(
    title = "New Cases per 100k Heatmap",
    xaxis = list(title = "State"),
    yaxis = list(title = "Date")
  )

heatmap_per100k

10. Maps

library(plotly)

# For specified date
pick.date <- "2021-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% 
  filter(date == pick.date) %>% 
  select(state, abb, naive_CFR) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Naive CFR: ", naive_CFR))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Make sure both maps are on the same color scale
shadeLimit <- 5

# Create the map for the specified date
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~naive_CFR, text = ~hover, locations = ~state,
    color = ~naive_CFR, colors = 'Purples'
  )
fig_pick.date <- fig_pick.date %>% colorbar(title = paste0("Naive CFR: ", pick.date), limits = c(0, shadeLimit))
fig_pick.date <- fig_pick.date %>% layout(
  title = paste('Naive CFR by State as of', pick.date, '<br>(Hover for value)'),
  geo = set_map_details
)

#############
### Map for today's date

# Extract the data for each state by its abbreviation
cv_per100_today <- cv_states_today %>% 
  select(state, abb, naive_CFR) # select data
cv_per100_today$state_name <- cv_per100_today$state
cv_per100_today$state <- cv_per100_today$abb
cv_per100_today$abb <- NULL

# Create hover text
cv_per100_today$hover <- with(cv_per100_today, paste(state_name, '<br>', "Naive CFR: ", naive_CFR))

# Create the map for today's date
fig_today <- plot_geo(cv_per100_today, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~naive_CFR, text = ~hover, locations = ~state,
    color = ~naive_CFR, colors = 'Purples'
  )
fig_today <- fig_today %>% colorbar(title = paste0("Naive CFR: ", Sys.Date()), limits = c(0, shadeLimit))
fig_today <- fig_today %>% layout(
  title = paste('Naive CFR by State as of', Sys.Date(), '<br>(Hover for value)'),
  geo = set_map_details
)

# Plot the two maps together
subplot(fig_pick.date, fig_today, nrows = 2, margin = 0.05)